library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
path = "../../MorPhiC/data/June-2024/JAX_RNAseq_Neuroectoderm/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")meta = fread("data/June_2024/JAX_RNAseq_Neuroectoderm_meta_data.tsv",
data.table = FALSE)
dim(meta)## [1] 174 36
meta[1:2,]## clonal.label library_preparation.label label
## 1 MOK20002-WT GT23-05635 CBO-MOK20002-WT-Day7
## 2 MOK20002-WT GT23-05625 CBO-MOK20002-WT-Day6
## description differentiated_product_protocol_id
## 1 KOLF2.2 derived cortical brain organoid JAXPD003
## 2 KOLF2.2 derived cortical brain organoid JAXPD003
## model_system timepoint_value timepoint_unit final_timepoint
## 1 CBO 7 days No
## 2 CBO 6 days No
## treatment_condition wt_control_status ko_strategy ko_gene
## 1 Not applicable WT WT WT
## 2 Not applicable WT WT WT
## library_preparation.description
## 1 NA
## 2 NA
## library_preparation.library_preparation_protocol_id
## 1 JAXPL001
## 2 JAXPL001
## library_preparation.average_fragment_size
## 1 410
## 2 425
## library_preparation.input_amount_value library_preparation.input_amount_unit
## 1 300 ng
## 2 300 ng
## library_preparation.concentration_value
## 1 51.8
## 2 33.8
## library_preparation.concentration_unit library_preparation.total_yield_value
## 1 nM 249.6
## 2 nM 170.0
## library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1 ng 10
## 2 ng 10
## library_preparation.pcr_cycles_for_indexing
## 1 Not available
## 2 Not available
## file_id run_id
## 1 RFX_WT_Day7_GT23-05635_GCACGGAC-TGCGAGAC_S55_L002 20230424_23-scbct-028-run3
## 2 RFX_WT_Day6_GT23-05625_GACCTGAA-CTCACCAA_S97_L002 20230424_23-scbct-028-run3
## clonal.description clonal.parental_name clonal.clone_id clonal.type
## 1 KOLF2.2 KOLF2.2J WT iPSC
## 2 KOLF2.2 KOLF2.2J WT iPSC
## clonal.zygosity clonal.cell_line_generation_protocol
## 1 Not applicable Not applicable
## 2 Not applicable Not applicable
## clonal.treatment_condition clonal.wt_control_status
## 1 Not applicable WT
## 2 Not applicable WT
## expression_alteration.label model_organ
## 1 Not applicable Cortical brain (dorsal forebrain patterning)
## 2 Not applicable Cortical brain (dorsal forebrain patterning)
names(meta)## [1] "clonal.label"
## [2] "library_preparation.label"
## [3] "label"
## [4] "description"
## [5] "differentiated_product_protocol_id"
## [6] "model_system"
## [7] "timepoint_value"
## [8] "timepoint_unit"
## [9] "final_timepoint"
## [10] "treatment_condition"
## [11] "wt_control_status"
## [12] "ko_strategy"
## [13] "ko_gene"
## [14] "library_preparation.description"
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"
## [17] "library_preparation.input_amount_value"
## [18] "library_preparation.input_amount_unit"
## [19] "library_preparation.concentration_value"
## [20] "library_preparation.concentration_unit"
## [21] "library_preparation.total_yield_value"
## [22] "library_preparation.total_yield_unit"
## [23] "library_preparation.cdna_pcr_cycles"
## [24] "library_preparation.pcr_cycles_for_indexing"
## [25] "file_id"
## [26] "run_id"
## [27] "clonal.description"
## [28] "clonal.parental_name"
## [29] "clonal.clone_id"
## [30] "clonal.type"
## [31] "clonal.zygosity"
## [32] "clonal.cell_line_generation_protocol"
## [33] "clonal.treatment_condition"
## [34] "clonal.wt_control_status"
## [35] "expression_alteration.label"
## [36] "model_organ"
table(meta$model_organ, meta$ko_gene)##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4
## Cortical brain (dorsal forebrain patterning) 36 36 9 27 9 36
##
## WT
## Cortical brain (dorsal forebrain patterning) 21
table(meta$run_id, meta$ko_gene)##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## 20230228_23-scbct-008 0 0 0 0 9 0 1
## 20230424_23-scbct-028-run3 0 0 0 0 0 27 3
## 20230508_23-scbct-039-run2 27 0 0 0 0 0 3
## 20230522_23-scbct-052 0 27 0 0 0 0 3
## 20230824_23-scbct-092 0 0 9 0 0 0 1
## 20240131_23-robson-020 0 0 0 27 0 0 3
## 20240307_24-robson-003 0 0 0 0 0 9 1
## 20240307_24-robson-004 0 9 0 0 0 0 3
## 20240307_24-robson-006 9 0 0 0 0 0 3
table(meta$ko_strategy, meta$ko_gene)##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## CE 12 12 3 9 3 12 0
## KO 12 12 3 9 3 12 0
## PTC 12 12 3 9 3 12 0
## WT 0 0 0 0 0 0 21
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 175
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]## Name RFX_CE_F5_Day8_GT23-05641_GTCGGAGC-TTATAACC_S73_L002
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1521
## LHX5_CE_A1_Day3_GT23-07300_AAGTCCAA-TACTCATA_S163_L004
## 1 0
## 2 1270
## LHX5_KO_E4_Day5_GT23-07318_AACGTTCC-AGTACTCC_S166_L004
## 1 0
## 2 746
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 174
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Namemodel_s = meta$model_system
table(model_s, useNA="ifany")## model_s
## CBO
## 174
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
apply(cts_dat, 1, tapply, model_s, min),
apply(cts_dat, 1, tapply, model_s, median),
apply(cts_dat, 1, tapply, model_s, get_q75),
apply(cts_dat, 1, tapply, model_s, max))
dim(gene_info)## [1] 38592 5
gene_info[1:2, ]## Name apply.cts_dat..1..tapply..model_s..min.
## ENSG00000268674 ENSG00000268674 0
## ENSG00000271254 ENSG00000271254 426
## apply.cts_dat..1..tapply..model_s..median.
## ENSG00000268674 0
## ENSG00000271254 1318
## apply.cts_dat..1..tapply..model_s..get_q75.
## ENSG00000268674 0.00
## ENSG00000271254 1701.75
## apply.cts_dat..1..tapply..model_s..max.
## ENSG00000268674 3
## ENSG00000271254 3117
table(row.names(gene_info)==gene_info$Name, useNA="ifany")##
## TRUE
## 38592
names(gene_info)[2:5] = paste0(rep(c("CBO_"), times=4),
rep(c("min", "median", "q75", "max"), each=1))
dim(gene_info)## [1] 38592 5
gene_info[1:2,]## Name CBO_min CBO_median CBO_q75 CBO_max
## ENSG00000268674 ENSG00000268674 0 0 0.00 3
## ENSG00000271254 ENSG00000271254 426 1318 1701.75 3117
summary(gene_info[,-1])## CBO_min CBO_median CBO_q75 CBO_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 2
## Median : 0.0 Median : 3.0 Median : 5.8 Median : 22
## Mean : 213.8 Mean : 699.1 Mean : 931.4 Mean : 1829
## 3rd Qu.: 89.0 3rd Qu.: 332.1 3rd Qu.: 474.6 3rd Qu.: 1003
## Max. :88969.0 Max. :489181.5 Max. :586205.0 Max. :1301990
table(gene_info$CBO_q75 > 0)##
## FALSE TRUE
## 14390 24202
w2kp = which(gene_info$CBO_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)## [1] 24202 174
gene_info = gene_info[w2kp,]gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)## [1] 62754 8
gene_anno[1:2,]## geneId chr strand start end ensembl_gene_id
## <char> <char> <char> <int> <int> <char>
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## <char> <char>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
# all genes are contained in the annotation file
table(gene_info$Name %in% gene_anno$ensembl_gene_id)##
## TRUE
## 24202
setdiff(gene_info$Name, gene_anno$ensembl_gene_id)## character(0)
gene_info = merge(gene_anno, gene_info, by.x="ensembl_gene_id", by.y = "Name",
all.x = FALSE, all.y = TRUE)
dim(gene_info)## [1] 24202 12
gene_info[1:2,]## Key: <ensembl_gene_id>
## ensembl_gene_id geneId chr strand start end
## <char> <char> <char> <char> <int> <int>
## 1: ENSG00000000003 ENSG00000000003.16 chrX - 100627108 100639991
## 2: ENSG00000000005 ENSG00000000005.6 chrX + 100584936 100599885
## hgnc_symbol description CBO_min
## <char> <char> <int>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] 744
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] 0
## CBO_median CBO_q75 CBO_max
## <num> <num> <int>
## 1: 2644.0 3235.00 5449
## 2: 9.5 20.75 97
gene_info[which(!gene_info$ensembl_gene_id%in%gene_anno$ensembl_gene_id)]## Key: <ensembl_gene_id>
## Empty data.table (0 rows and 12 cols): ensembl_gene_id,geneId,chr,strand,start,end...
gene_info[gene_info$CBO_min > 2e4, c("CBO_min", "hgnc_symbol")]## CBO_min hgnc_symbol
## <int> <char>
## 1: 22057 ACTB
## 2: 23713 HNRNPA1
## 3: 66180 EEF1A1
## 4: 23858 TUBB
## 5: 88639 MT-CO1
## 6: 29947 MT-ND4
## 7: 39231 MT-CO3
## 8: 88969 MALAT1
gene_info$gene_symbol = gene_info$hgnc_symbol
table(gene_info$gene_symbol == "")##
## FALSE TRUE
## 19133 5069
table(is.na(gene_info$gene_symbol))##
## FALSE
## 24202
wEns = which(gene_info$gene_symbol == "" | is.na(gene_info$gene_symbol))
gene_info$gene_symbol[wEns] = gene_info$ensembl_gene_id[wEns]
table(gene_info$gene_symbol == "")##
## FALSE
## 24202
table(is.na(gene_info$gene_symbol))##
## FALSE
## 24202
There is only one model system, CBO.
DESeq2 was run for different day groups separately:
day 3 + day 4, with a day indictor
day 6
day 7
day 8 (exclude samples from run ID short scbct-028-run3)
day 9
The specific DE groups can be found in the file with naming in the format:
${dataset_name}_DE_n_samples.tsv
release_name = "2024_06_JAX_RNAseq_Neuroectoderm"
result_dir = file.path("results", release_name)
processed_output_dir = file.path(result_dir, "processed")
all_result_files = list.files(path=processed_output_dir, pattern=".tsv",
all.files=TRUE, full.names=FALSE)
all_result_files = sort(all_result_files)
extract_cores <- function(x){
x_split = str_split(x, "_")[[1]]
x_start = 6
x_end = length(x_split)-1
x_d_group = paste(x_split[x_start:(x_end-1)], collapse="_")
x_gene = x_split[x_end]
return(c(x_d_group, x_gene))
}
df_file_info = NULL
for (x in all_result_files){
df_file_info = rbind(df_file_info, extract_cores(x))
}
df_file_info = as.data.frame(df_file_info)
colnames(df_file_info) = c("DE_general_group", "gene")
df_file_info## DE_general_group gene
## 1 CBO_day_3_4 LHX5
## 2 CBO_day_6 FEZF2
## 3 CBO_day_6 LHX5
## 4 CBO_day_6 RFX4
## 5 CBO_day_7 FEZF2
## 6 CBO_day_7 OTX1
## 7 CBO_day_7 PAX6
## 8 CBO_day_7 RFX4
## 9 CBO_day_8 FEZF2
## 10 CBO_day_8 OTX1
## 11 CBO_day_9 OTX1
## 12 CBO_day_9 RFX4
For this dataset, there is only one model system, and for DE group (day) and each knockout gene, there are three knockout strategies.
df_file_info$items = paste(df_file_info$DE_general_group,
df_file_info$gene, sep="_")
df_file_info$items## [1] "CBO_day_3_4_LHX5" "CBO_day_6_FEZF2" "CBO_day_6_LHX5" "CBO_day_6_RFX4"
## [5] "CBO_day_7_FEZF2" "CBO_day_7_OTX1" "CBO_day_7_PAX6" "CBO_day_7_RFX4"
## [9] "CBO_day_8_FEZF2" "CBO_day_8_OTX1" "CBO_day_9_OTX1" "CBO_day_9_RFX4"
table(table(df_file_info$items))##
## 1
## 12
fc0 = log2(1.5)
p0 = 0.05
gene_symbols = df_file_info$gene
gene_symbols## [1] "LHX5" "FEZF2" "LHX5" "RFX4" "FEZF2" "OTX1" "PAX6" "RFX4" "FEZF2"
## [10] "OTX1" "OTX1" "RFX4"
table(gene_symbols %in% gene_info$gene_symbol)##
## TRUE
## 12
genes_w_multi_ensembl = names(which(table(gene_info$gene_symbol)>1))
genes_w_multi_ensembl## [1] "LINC01238"
df = data.frame(KO = df_file_info$items,
group = df_file_info$DE_general_group,
gene_symbol = df_file_info$gene,
CE_nDE = NA, KO_nDE = NA, PTC_nDE = NA,
CE_padj = NA, CE_log2FoldChange = NA,
KO_padj = NA, KO_log2FoldChange = NA,
PTC_padj = NA, PTC_log2FoldChange = NA)
df$Model = str_extract(df$KO, "^[a-zA-Z0-9]+")
gene_ids = gene_info$ensembl_gene_id[match(df_file_info$gene, gene_info$gene_symbol)]
gene_ids## [1] "ENSG00000089116" "ENSG00000153266" "ENSG00000089116" "ENSG00000111783"
## [5] "ENSG00000153266" "ENSG00000115507" "ENSG00000007372" "ENSG00000111783"
## [9] "ENSG00000153266" "ENSG00000115507" "ENSG00000115507" "ENSG00000111783"
for (k in 1:nrow(df_file_info)){
res = fread(file.path(processed_output_dir,
paste0(release_name, "_", df_file_info$items[k], "_DEseq2.tsv")),
data.table = FALSE)
print(res[1:2,1:15])
gene_id = gene_ids[k]
w_gene = which(res$gene_ID == gene_id)
stopifnot(length(w_gene) == 1)
stopifnot(!(df$gene_symbol[k]%in%genes_w_multi_ensembl))
for(ko1 in c("CE", "KO", "PTC")){
fc = paste0(df$gene_symbol[k], "_", ko1, "_log2FoldChange")
padj = paste0(df$gene_symbol[k], "_", ko1, "_padj")
col_name = paste0(ko1, "_nDE")
df[k,col_name] = sum(abs(res[[fc]]) > fc0 & res[[padj]] < p0, na.rm = TRUE)
col_name = paste0(ko1, "_log2FoldChange")
df[k,col_name] = res[[fc]][w_gene]
col_name = paste0(ko1, "_padj")
df[k,col_name] = res[[padj]][w_gene]
}
}## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000276345 KI270721.1 + 2585 11802
## LHX5_CE_log2FoldChange LHX5_CE_pvalue LHX5_CE_padj LHX5_KO_log2FoldChange
## 1 0.0376 0.707 0.957 0.025
## 2 1.4400 0.628 NA 0.106
## LHX5_KO_pvalue LHX5_KO_padj LHX5_PTC_log2FoldChange LHX5_PTC_pvalue
## 1 0.802 1 0.0541 0.588
## 2 0.972 NA -0.3200 0.915
## LHX5_PTC_padj
## 1 0.974
## 2 0.996
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000276345 KI270721.1 + 2585 11802
## FEZF2_CE_log2FoldChange FEZF2_CE_pvalue FEZF2_CE_padj FEZF2_KO_log2FoldChange
## 1 0.207 0.245 0.789 0.109
## 2 -3.060 0.549 NA -4.720
## FEZF2_KO_pvalue FEZF2_KO_padj FEZF2_PTC_log2FoldChange FEZF2_PTC_pvalue
## 1 0.538 1 0.183 0.302
## 2 0.356 NA -0.822 0.870
## FEZF2_PTC_padj
## 1 1
## 2 NA
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000276345 KI270721.1 + 2585 11802
## LHX5_CE_log2FoldChange LHX5_CE_pvalue LHX5_CE_padj LHX5_KO_log2FoldChange
## 1 0.182 0.0947 1 0.0786
## 2 1.820 0.5700 NA 3.5000
## LHX5_KO_pvalue LHX5_KO_padj LHX5_PTC_log2FoldChange LHX5_PTC_pvalue
## 1 0.501 0.999 0.0715 0.506
## 2 0.302 NA 0.6550 0.838
## LHX5_PTC_padj
## 1 1
## 2 NA
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000276345 KI270721.1 + 2585 11802
## RFX4_CE_log2FoldChange RFX4_CE_pvalue RFX4_CE_padj RFX4_KO_log2FoldChange
## 1 -0.133 0.460 1 -0.0429
## 2 3.990 0.462 NA -0.0978
## RFX4_KO_pvalue RFX4_KO_padj RFX4_PTC_log2FoldChange RFX4_PTC_pvalue
## 1 0.810 0.967 -0.13 0.469
## 2 0.986 NA 2.89 0.594
## RFX4_PTC_padj
## 1 1
## 2 NA
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## FEZF2_CE_log2FoldChange FEZF2_CE_pvalue FEZF2_CE_padj FEZF2_KO_log2FoldChange
## 1 -0.1590 0.372 0.686 -0.0756
## 2 -0.0632 0.822 0.935 0.1980
## FEZF2_KO_pvalue FEZF2_KO_padj FEZF2_PTC_log2FoldChange FEZF2_PTC_pvalue
## 1 0.672 1 -0.0741 0.678
## 2 0.482 1 0.3160 0.261
## FEZF2_PTC_padj
## 1 1
## 2 1
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## OTX1_CE_log2FoldChange OTX1_CE_pvalue OTX1_CE_padj OTX1_KO_log2FoldChange
## 1 0.434 0.0177 1 0.2230
## 2 -0.321 0.2610 1 -0.0552
## OTX1_KO_pvalue OTX1_KO_padj OTX1_PTC_log2FoldChange OTX1_PTC_pvalue
## 1 0.224 1 0.0815 0.658
## 2 0.846 1 -0.1100 0.698
## OTX1_PTC_padj
## 1 1
## 2 1
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## PAX6_CE_log2FoldChange PAX6_CE_pvalue PAX6_CE_padj PAX6_KO_log2FoldChange
## 1 0.0158 0.929 1 0.0621
## 2 0.0929 0.741 1 -0.0896
## PAX6_KO_pvalue PAX6_KO_padj PAX6_PTC_log2FoldChange PAX6_PTC_pvalue
## 1 0.726 1 -0.121 0.493
## 2 0.750 1 0.290 0.302
## PAX6_PTC_padj
## 1 1
## 2 1
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## RFX4_CE_log2FoldChange RFX4_CE_pvalue RFX4_CE_padj RFX4_KO_log2FoldChange
## 1 -0.1020 0.570 1 -0.000999
## 2 -0.0515 0.855 1 -0.022300
## RFX4_KO_pvalue RFX4_KO_padj RFX4_PTC_log2FoldChange RFX4_PTC_pvalue
## 1 0.996 1 -0.115 0.521
## 2 0.937 1 0.114 0.685
## RFX4_PTC_padj
## 1 1
## 2 1
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## FEZF2_CE_log2FoldChange FEZF2_CE_pvalue FEZF2_CE_padj FEZF2_KO_log2FoldChange
## 1 -0.0469 0.6230 1.000 0.0721
## 2 0.3150 0.0564 0.991 0.1090
## FEZF2_KO_pvalue FEZF2_KO_padj FEZF2_PTC_log2FoldChange FEZF2_PTC_pvalue
## 1 0.450 0.907 0.0336 0.725
## 2 0.514 0.924 0.0663 0.691
## FEZF2_PTC_padj
## 1 0.997
## 2 0.997
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## OTX1_CE_log2FoldChange OTX1_CE_pvalue OTX1_CE_padj OTX1_KO_log2FoldChange
## 1 0.0453 0.745 0.899 -0.070
## 2 -0.1620 0.517 0.777 0.169
## OTX1_KO_pvalue OTX1_KO_padj OTX1_PTC_log2FoldChange OTX1_PTC_pvalue
## 1 0.617 0.900 -0.2360 0.0934
## 2 0.500 0.853 -0.0473 0.8520
## OTX1_PTC_padj
## 1 0.755
## 2 0.982
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## OTX1_CE_log2FoldChange OTX1_CE_pvalue OTX1_CE_padj OTX1_KO_log2FoldChange
## 1 0.171 0.285 1 0.106
## 2 -0.201 0.470 1 0.236
## OTX1_KO_pvalue OTX1_KO_padj OTX1_PTC_log2FoldChange OTX1_PTC_pvalue
## 1 0.533 1 0.0774 0.630
## 2 0.420 1 -0.1980 0.478
## OTX1_PTC_padj
## 1 1
## 2 1
## gene_ID hgnc_symbol chr strand start end
## 1 ENSG00000271254 KI270711.1 - 4612 29626
## 2 ENSG00000277196 KI270734.1 - 138082 161852
## RFX4_CE_log2FoldChange RFX4_CE_pvalue RFX4_CE_padj RFX4_KO_log2FoldChange
## 1 0.2880 0.0774 1 -0.0742
## 2 -0.0357 0.8770 1 0.3340
## RFX4_KO_pvalue RFX4_KO_padj RFX4_PTC_log2FoldChange RFX4_PTC_pvalue
## 1 0.649 1 0.212 0.200
## 2 0.144 1 0.242 0.303
## RFX4_PTC_padj
## 1 1
## 2 1
print(df)## KO group gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj
## 1 CBO_day_3_4_LHX5 CBO_day_3_4 LHX5 10 7 11 0.928
## 2 CBO_day_6_FEZF2 CBO_day_6 FEZF2 3 1 1 0.597
## 3 CBO_day_6_LHX5 CBO_day_6 LHX5 4 0 0 1.000
## 4 CBO_day_6_RFX4 CBO_day_6 RFX4 1 6 2 1.000
## 5 CBO_day_7_FEZF2 CBO_day_7 FEZF2 1 3 0 NA
## 6 CBO_day_7_OTX1 CBO_day_7 OTX1 2 1 4 1.000
## 7 CBO_day_7_PAX6 CBO_day_7 PAX6 0 0 0 1.000
## 8 CBO_day_7_RFX4 CBO_day_7 RFX4 0 0 0 1.000
## 9 CBO_day_8_FEZF2 CBO_day_8 FEZF2 0 5 0 1.000
## 10 CBO_day_8_OTX1 CBO_day_8 OTX1 84 104 6 0.243
## 11 CBO_day_9_OTX1 CBO_day_9 OTX1 37 40 44 1.000
## 12 CBO_day_9_RFX4 CBO_day_9 RFX4 37 56 33 1.000
## CE_log2FoldChange KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange
## 1 0.455 1.00e+00 -1.120 0.8550 -2.0700
## 2 -2.280 1.09e-02 -6.260 1.0000 -0.0951
## 3 -0.576 9.99e-01 -0.295 1.0000 -0.7050
## 4 -0.609 9.50e-01 0.584 1.0000 -0.8300
## 5 -0.296 2.63e-06 -4.010 1.0000 1.6100
## 6 0.606 1.00e+00 0.468 1.0000 -0.0452
## 7 -0.567 1.00e+00 1.440 1.0000 -1.2700
## 8 -1.290 1.00e+00 -0.839 1.0000 -0.9720
## 9 -0.344 1.31e-30 -3.710 0.0907 0.8070
## 10 0.569 5.00e-01 0.447 0.9550 0.1370
## 11 0.613 1.00e+00 0.447 1.0000 0.3830
## 12 -1.160 2.69e-01 -2.670 1.0000 -1.2300
## Model
## 1 CBO
## 2 CBO
## 3 CBO
## 4 CBO
## 5 CBO
## 6 CBO
## 7 CBO
## 8 CBO
## 9 CBO
## 10 CBO
## 11 CBO
## 12 CBO
p1 <- ggplot(df, aes(x=(PTC_nDE), y=(CE_nDE),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("# of DE genes") +
labs(x="PTC", y="CE") +
geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")
p2 <- ggplot(df, aes(x=(PTC_nDE), y=(KO_nDE),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("# of DE genes") +
labs(x="PTC", y="KO") +
geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")
p3 <- ggplot(df, aes(x=CE_log2FoldChange, y=-log10(CE_padj),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("DE results of the KO genes, CE") +
labs(x="log2 fold change", y="-log10(adjusted p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
p4 <- ggplot(df, aes(x=KO_log2FoldChange, y=-log10(KO_padj),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("DE results of the KO genes, KO") +
labs(x="log2 fold change", y="-log10(adjusted p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
p5 <- ggplot(df, aes(x=PTC_log2FoldChange, y=-log10(PTC_padj),
label=gene_symbol, color=group)) +
geom_point() + scale_color_brewer(palette = "Dark2")+
geom_text_repel(point.padding = 0.5, min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20, show.legend = FALSE) +
ggtitle("DE results of the KO genes, PTC") +
labs(x="log2 fold change", y="-log10(adjusted p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
g_combined <- ggarrange(p1, p2, p3, p4, p5, ncol = 2, nrow = 3)## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
print(g_combined)In some (day group, knockout gene, knockout strategy) combinations, there are too many DE genes and they cannot all be labeled in the volcano plots.
df_file_info## DE_general_group gene items
## 1 CBO_day_3_4 LHX5 CBO_day_3_4_LHX5
## 2 CBO_day_6 FEZF2 CBO_day_6_FEZF2
## 3 CBO_day_6 LHX5 CBO_day_6_LHX5
## 4 CBO_day_6 RFX4 CBO_day_6_RFX4
## 5 CBO_day_7 FEZF2 CBO_day_7_FEZF2
## 6 CBO_day_7 OTX1 CBO_day_7_OTX1
## 7 CBO_day_7 PAX6 CBO_day_7_PAX6
## 8 CBO_day_7 RFX4 CBO_day_7_RFX4
## 9 CBO_day_8 FEZF2 CBO_day_8_FEZF2
## 10 CBO_day_8 OTX1 CBO_day_8_OTX1
## 11 CBO_day_9 OTX1 CBO_day_9_OTX1
## 12 CBO_day_9 RFX4 CBO_day_9_RFX4
fc0## [1] 0.5849625
p0## [1] 0.05
for(k in 1:nrow(df_file_info)){
it_k = df_file_info$items[k]
d_group = df_file_info$DE_general_group[k]
gene_name = df_file_info$gene[k]
print(it_k)
res = fread(file.path(processed_output_dir,
paste0(release_name, "_", it_k, "_DEseq2.tsv")),
data.table = FALSE)
for (ko1 in c("CE", "KO", "PTC")){
res_k = res[,c(1, grep(ko1, names(res)))]
names(res_k) = gsub(paste0(gene_name, "_", ko1, "_"), "", names(res_k))
ww_up = which((res_k$log2FoldChange > fc0) & (res_k$padj < p0))
ww_down = which((res_k$log2FoldChange < ((-1)*fc0)) & (res_k$padj < p0))
res_k$DE = "No"
res_k$DE[ww_up] <- "Up"
res_k$DE[ww_down] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
res_k$delabel = NA
res_k$gene_symbol = gene_info$gene_symbol[match(res_k$gene_ID,
gene_info$ensembl_gene_id)]
w_de = which(res_k$DE != "No")
res_k$delabel[w_de] = res_k$gene_symbol[w_de]
print(table(res_k$DE))
if (gene_name=="EPAS1"){
d_group_in_title = d_group
}else{
d_group_in_title = gsub("_nor", "", d_group)
}
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=DE, label=delabel)) +
geom_point() + ggtitle(paste0(d_group_in_title, "\n", gene_name, "_", ko1)) +
geom_text_repel(point.padding = 0.5,
min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20,
show.legend = FALSE) +
scale_color_manual(values=col2use)
print(g2)
}
}## [1] "CBO_day_3_4_LHX5"
##
## Down No Up
## 6 23232 4
## Warning: Removed 2138 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23232 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No
## 7 23235
## Warning: Removed 1891 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23235 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No Up
## 10 23231 1
## Warning: Removed 2332 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23231 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_6_FEZF2"
##
## Down No
## 3 23815
## Warning: Removed 2238 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23815 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No
## 1 23817
## Warning: Removed 2277 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23817 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No Up
## 23817 1
## Warning: Removed 2240 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23817 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_6_LHX5"
##
## Down No Up
## 1 23814 3
## Warning: Removed 2856 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23814 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23818
## Warning: Removed 2798 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23818 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23818
## Warning: Removed 2815 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23818 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_6_RFX4"
##
## No Up
## 23817 1
## Warning: Removed 2295 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23817 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No Up
## 2 23812 4
## Warning: Removed 2293 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23812 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No Up
## 23816 2
## Warning: Removed 2214 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23816 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_7_FEZF2"
##
## Down No
## 1 23664
## Warning: Removed 14223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23664 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No Up
## 2 23662 1
## Warning: Removed 1712 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23662 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23665
## Warning: Removed 1738 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_7_OTX1"
##
## No Up
## 23663 2
## Warning: Removed 1669 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23663 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No Up
## 23664 1
## Warning: Removed 1601 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23664 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No Up
## 1 23661 3
## Warning: Removed 1746 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23661 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_7_PAX6"
##
## No
## 23665
## Warning: Removed 1971 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23665
## Warning: Removed 1964 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23665
## Warning: Removed 1894 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_7_RFX4"
##
## No
## 23665
## Warning: Removed 1872 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23665
## Warning: Removed 1924 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23665
## Warning: Removed 1900 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 23665 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_8_FEZF2"
##
## No
## 23000
## Warning: Removed 1468 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23000 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## Down No Up
## 1 22995 4
## Warning: Removed 5363 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22995 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
##
## No
## 23000
## Warning: Removed 1573 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23000 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_8_OTX1"
##
## Down No Up
## 75 22916 9
## Warning: Removed 9812 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22916 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 73 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 46 22896 58
## Warning: Removed 8034 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22896 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 89 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No
## 6 22994
## Warning: Removed 4919 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22994 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## [1] "CBO_day_9_OTX1"
##
## Down No Up
## 9 22886 28
## Warning: Removed 702 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22886 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 13 22883 27
## Warning: Removed 840 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22883 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 13 22879 31
## Warning: Removed 747 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22879 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "CBO_day_9_RFX4"
##
## Down No Up
## 8 22886 29
## Warning: Removed 813 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22886 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 15 22867 41
## Warning: Removed 732 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22867 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 9 22890 24
## Warning: Removed 838 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22890 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 7877604 420.8 12594078 672.6 NA 12594078 672.6
## Vcells 22822319 174.2 57575243 439.3 65536 47912703 365.6
sessionInfo()## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.3 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.15.4
## [5] dplyr_1.1.4 ggpointdensity_0.1.0
## [7] viridis_0.6.5 viridisLite_0.4.2
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.3.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-60
## [21] stringr_1.5.1 ggpubr_0.6.0
## [23] ggrepel_0.9.5 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-8 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.7 tools_4.2.3 backports_1.5.0
## [7] bslib_0.8.0 utf8_1.2.4 R6_2.5.1
## [10] DBI_1.2.3 colorspace_2.1-1 GetoptLong_1.0.5
## [13] withr_3.0.1 tidyselect_1.2.1 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.3 cli_3.6.3
## [19] DelayedArray_0.24.0 labeling_0.4.3 sass_0.4.9
## [22] scales_1.3.0 digest_0.6.37 rmarkdown_2.28
## [25] XVector_0.38.0 pkgconfig_2.0.3 htmltools_0.5.8.1
## [28] highr_0.11 fastmap_1.2.0 GlobalOptions_0.1.2
## [31] rlang_1.1.4 rstudioapi_0.16.0 RSQLite_2.3.7
## [34] farver_2.1.2 shape_1.4.6.1 jquerylib_0.1.4
## [37] generics_0.1.3 jsonlite_1.8.8 BiocParallel_1.32.6
## [40] car_3.1-2 RCurl_1.98-1.16 magrittr_2.0.3
## [43] GenomeInfoDbData_1.2.9 Matrix_1.5-4.1 Rcpp_1.0.13
## [46] munsell_0.5.1 fansi_1.0.6 abind_1.4-5
## [49] lifecycle_1.0.4 stringi_1.8.4 yaml_2.3.10
## [52] carData_3.0-5 zlibbioc_1.44.0 plyr_1.8.9
## [55] blob_1.2.4 parallel_4.2.3 crayon_1.5.3
## [58] lattice_0.22-6 cowplot_1.1.3 Biostrings_2.66.0
## [61] annotate_1.76.0 circlize_0.4.16 KEGGREST_1.38.0
## [64] locfit_1.5-9.10 knitr_1.48 pillar_1.9.0
## [67] rjson_0.2.21 ggsignif_0.6.4 geneplotter_1.76.0
## [70] codetools_0.2-20 XML_3.99-0.17 glue_1.7.0
## [73] evaluate_0.24.0 foreach_1.5.2 vctrs_0.6.5
## [76] png_0.1-8 cellranger_1.1.0 gtable_0.3.5
## [79] purrr_1.0.2 tidyr_1.3.1 clue_0.3-65
## [82] cachem_1.1.0 xfun_0.47 xtable_1.8-4
## [85] broom_1.0.6 rstatix_0.7.2 tibble_3.2.1
## [88] iterators_1.0.14 AnnotationDbi_1.60.2 memoise_2.0.1
## [91] cluster_2.1.6